Important Library¶
In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
Loading dataset¶
In [5]:
df=pd.read_csv(r"C:\Users\DELL\Downloads\diabetes.csv")
exploratory Data Analysis¶
In [13]:
# Display first 10 record of the data
df.head()
Out[13]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
In [14]:
# Display last 10 record of the data
df.tail()
Out[14]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
In [17]:
# Display randomly any number of record of the data
df.sample(5)
Out[17]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 394 | 4 | 158 | 78 | 0 | 0 | 32.9 | 0.803 | 31 | 1 |
| 633 | 1 | 128 | 82 | 17 | 183 | 27.5 | 0.115 | 22 | 0 |
| 248 | 9 | 124 | 70 | 33 | 402 | 35.4 | 0.282 | 34 | 0 |
| 180 | 6 | 87 | 80 | 0 | 0 | 23.2 | 0.084 | 32 | 0 |
| 99 | 1 | 122 | 90 | 51 | 220 | 49.7 | 0.325 | 31 | 1 |
In [19]:
# number of rows and column
df.shape
Out[19]:
(768, 9)
In [21]:
# list types of all columns
df.dtypes
Out[21]:
Pregnancies int64 Glucose int64 BloodPressure int64 SkinThickness int64 Insulin int64 BMI float64 DiabetesPedigreeFunction float64 Age int64 Outcome int64 dtype: object
In [22]:
# findout if the dataset is contain null value or not
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
In [23]:
# Statistical summary
df.describe()
Out[23]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
In [24]:
df.shape
Out[24]:
(768, 9)
In [27]:
df=df.drop_duplicates()
In [28]:
df.shape
Out[28]:
(768, 9)
In [29]:
# Count the null values
#checking the missing value in any column
#Dispplay number of null values in every column in dataset
df.isnull().sum()
Out[29]:
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
In [30]:
df.columns
Out[30]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
checking the no.of zeros values in dataset¶
In [34]:
print('NO.of zeros value in Glucose',df[df['Glucose']==0].shape[0])
NO.of zeros value in Glucose 5
In [35]:
print('NO.of zeros value in BloodPressure',df[df['BloodPressure']==0].shape[0])
NO.of zeros value in BloodPressure 35
In [36]:
print('NO.of zeros value in SkinThickness',df[df['SkinThickness']==0].shape[0])
NO.of zeros value in SkinThickness 227
In [37]:
print('NO.of zeros value in Insulin',df[df['Insulin']==0].shape[0])
NO.of zeros value in Insulin 374
In [38]:
print('NO.of zeros value is BMI',df[df['BMI']==0].shape[0])
NO.of zeros value is BMI 11
Replace no.of zeroes valurs with mean of that columns¶
In [39]:
df['Glucose']=df['Glucose'].replace(0,df['Glucose'].mean())
print('NO.of zeros value in Glucose',df[df['Glucose']==0].shape[0])
NO.of zeros value in Glucose 0
In [42]:
df['BloodPressure']=df['BloodPressure'].replace(0,df['BloodPressure'].mean())
df['SkinThickness']=df['SkinThickness'].replace(0,df['SkinThickness'].mean())
df['Insulin']=df['Insulin'].replace(0,df['Insulin'].mean())
df['BMI']=df['BMI'].replace(0,df['BMI'].mean())
In [43]:
df.describe()
Out[43]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 121.681605 | 72.254807 | 26.606479 | 118.660163 | 32.450805 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 30.436016 | 12.115932 | 9.631241 | 93.080358 | 6.875374 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 44.000000 | 24.000000 | 7.000000 | 14.000000 | 18.200000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.750000 | 64.000000 | 20.536458 | 79.799479 | 27.500000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 79.799479 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
Data Visualization¶
In [80]:
f, ax = plt.subplots(1, 2, figsize=(10, 5))
df['Outcome'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Outcome')
ax[0].set_ylabel('')
sns.countplot(x='Outcome', data=df, ax=ax[1])
#sns.countplot(y='Outcome', data=df, ax=ax[1])
ax[1].set_title('Outcome')
N, P = df['Outcome'].value_counts()
print('Negative(0): ', N)
print('Positive(1): ', P)
plt.grid()
plt.show()
Negative(0): 500 Positive(1): 268
In [81]:
# Histogramof each feature
df.hist(bins=10,figsize=(10,10))
plt.show()
In [84]:
# scatter plot matix
from pandas.plotting import scatter_matrix
scatter_matrix(df,figsize=(20,20));
In [87]:
# pairplot
sns.pairplot(data=df,hue='Outcome')
plt.show()
Analysing the relationship between the variables¶
Correlation Analysis¶
In [95]:
# get correlation of each features in dataset
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(10, 10))
# Plot the heatmap
g = sns.heatmap(corrmat, annot=True, cmap='RdYlGn')
plt.show()
Split the dataframe into x and y¶
In [98]:
target_name='Outcome'
#sperate object for target feature
y=df[target_name]
# Seperate Object for input featyres
X=df.drop(target_name,axis=1)
In [99]:
X.head()
Out[99]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148.0 | 72.0 | 35.000000 | 79.799479 | 33.6 | 0.627 | 50 |
| 1 | 1 | 85.0 | 66.0 | 29.000000 | 79.799479 | 26.6 | 0.351 | 31 |
| 2 | 8 | 183.0 | 64.0 | 20.536458 | 79.799479 | 23.3 | 0.672 | 32 |
| 3 | 1 | 89.0 | 66.0 | 23.000000 | 94.000000 | 28.1 | 0.167 | 21 |
| 4 | 0 | 137.0 | 40.0 | 35.000000 | 168.000000 | 43.1 | 2.288 | 33 |
In [100]:
y.head()
Out[100]:
0 1 1 0 2 1 3 0 4 1 Name: Outcome, dtype: int64
Apply Feature Scaling¶
In [102]:
# Apply Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X)
SSx=scaler.transform(X)
Train Test split¶
In [103]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(SSx,y,test_size=0.2, random_state=7)
In [104]:
X_train.shape,y_train.shape
Out[104]:
((614, 8), (614,))
In [105]:
X_test.shape,y_test.shape
Out[105]:
((154, 8), (154,))
Build the classification Algorithm¶
- Logistic regression
In [108]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train,y_train)
Out[108]:
LogisticRegression(multi_class='ovr', solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='ovr', solver='liblinear')
- KneighboursClssifier(KNN)
In [109]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
knn.fit(X_train,y_train)
Out[109]:
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
- Nave-Bayes classfier
In [112]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
Out[112]:
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GaussianNB()
- Support Vector Machine (SVM)
In [117]:
from sklearn.svm import SVC
sv=SVC()
sv.fit(X_train,y_train)
Out[117]:
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
- Decision Tree
In [118]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
Out[118]:
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
- Random Forest
In [119]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(criterion='entropy')
rf.fit(X_train,y_train)
Out[119]:
RandomForestClassifier(criterion='entropy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(criterion='entropy')
Making Prediction¶
- making prediction on test by using logistic Regresiion
In [127]:
X_test.shape
Out[127]:
(154, 8)
In [121]:
lr_pred=lr.predict(X_test)
In [128]:
lr_pred.shape
Out[128]:
(154,)
- making prediction on test by using KNN
In [123]:
knn_pred=knn.predict(X_test)
- making prediction on test by using Naivie Bayes
In [125]:
nb_pred=nb.predict(X_test)
- making prediction on test by using SVM
In [130]:
sv_pred=sv.predict(X_test)
- making prediction on test by using Decision Tree
In [132]:
dt_pred=dt.predict(X_test)
- making prediction on test by using Random Forest
In [134]:
rf_pred=rf.predict(X_test)
Model Evaluation¶
- Train Score and Test Score
In [137]:
from sklearn.metrics import accuracy_score
print("train accuracy of Logistic Regression",lr.score(X_train,y_train)*100)
print("Accuracy (Test) score of LogisticRegression",lr.score(X_test,y_test)*100)
print("Accuracy(Test) score of Logistic Regression",accuracy_score(y_test,lr_pred)*100)
train accuracy of Logistic Regression 77.36156351791531 Accuracy (Test) score of LogisticRegression 77.27272727272727 Accuracy(Test) score of Logistic Regression 77.27272727272727
In [138]:
# knn
print("train accuracy of KNN",knn.score(X_train,y_train)*100)
print("Accuracy (Test) score of KNN",knn.score(X_test,y_test)*100)
print("Accuracy(Test) score of KNN",accuracy_score(y_test,knn_pred)*100)
train accuracy of KNN 81.10749185667753 Accuracy (Test) score of KNN 74.67532467532467 Accuracy(Test) score of KNN 74.67532467532467
In [141]:
#Navie-Bayes
print("train accuracy of Navie Bayes",nb.score(X_train,y_train)*100)
print("Accuracy (Test) score of Navie Bayes",nb.score(X_test,y_test)*100)
print("Accuracy(Test) score of Navie Bayes",accuracy_score(y_test,nb_pred)*100)
train accuracy of Navie Bayes 74.2671009771987 Accuracy (Test) score of Navie Bayes 74.02597402597402 Accuracy(Test) score of Navie Bayes 74.02597402597402
In [142]:
# SVM
print("train accuracy of SVM",sv.score(X_train,y_train)*100)
print("Accuracy (Test) score ofSVM",sv.score(X_test,y_test)*100)
print("Accuracy(Test) score of SVM",accuracy_score(y_test,sv_pred)*100)
train accuracy of SVM 81.92182410423453 Accuracy (Test) score ofSVM 83.11688311688312 Accuracy(Test) score of SVM 83.11688311688312
In [143]:
# Decision Tree
print("train accuracy of Decision Tree",dt.score(X_train,y_train)*100)
print("Accuracy (Test) score ofDecision Tree",dt.score(X_test,y_test)*100)
print("Accuracy(Test) score of Decision Tree",accuracy_score(y_test,dt_pred)*100)
train accuracy of Decision Tree 100.0 Accuracy (Test) score ofDecision Tree 80.51948051948052 Accuracy(Test) score of Decision Tree 80.51948051948052
In [146]:
# Random Forest
print("train accuracy of Random Forest",rf.score(X_train,y_train)*100)
print("Accuracy (Test) score of Random Forest",rf.score(X_test,y_test)*100)
print("Accuracy(Test) score of Random Forest",accuracy_score(y_test,rf_pred)*100)
train accuracy of Random Forest 100.0 Accuracy (Test) score of Random Forest 79.22077922077922 Accuracy(Test) score of Random Forest 79.22077922077922
Confusion Matrix¶
In [148]:
from sklearn.metrics import classification_report,confusion_matrix
# confusion Matrix of Logistic Regression
cm=confusion_matrix(y_test,lr_pred)
cm
Out[148]:
array([[86, 11],
[24, 33]], dtype=int64)
In [151]:
sns.heatmap(confusion_matrix(y_test,lr_pred),annot=True,fmt="d")
Out[151]:
<Axes: >
In [155]:
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
In [156]:
TN,FP,FN,TP
Out[156]:
(86, 11, 24, 33)
In [213]:
# making the confusion matrix of the Logistic Regression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
cm=confusion_matrix(y_test,lr_pred)
print('TN-True Negative {} '.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
TN-True Negative 86 FP-False Positive 11 FN-false Negative 24 TP-True Positive 33 Accuracy rate 77.27272727272727 misclassification Rate:22.727272727272727
In [214]:
77.27272727272727+22.727272727272727
Out[214]:
100.0
In [215]:
import matplotlib.pyplot as plt
import numpy as np
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.title('Confusion Matrix of the logistic Regression')
plt.ylabel('Actual (true) values')
plt.xlabel('Predicted values')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
for j in range(2):
plt.text(j, i, str(s[i][j]) + "-" + str(cm[i][j]))
plt.show()
In [216]:
pd.crosstab(y_test,lr_pred,margins=False)
Out[216]:
| col_0 | 0 | 1 |
|---|---|---|
| Outcome | ||
| 0 | 86 | 11 |
| 1 | 24 | 33 |
In [217]:
pd.crosstab(y_test,lr_pred,margins=True)
Out[217]:
| col_0 | 0 | 1 | All |
|---|---|---|---|
| Outcome | |||
| 0 | 86 | 11 | 97 |
| 1 | 24 | 33 | 57 |
| All | 110 | 44 | 154 |
In [218]:
pd.crosstab(y_test,lr_pred,rownames=['Actual values'],colnames=['Predicted values'],margins=True)
Out[218]:
| Predicted values | 0 | 1 | All |
|---|---|---|---|
| Actual values | |||
| 0 | 86 | 11 | 97 |
| 1 | 24 | 33 | 57 |
| All | 110 | 44 | 154 |
Precision(PPV-Positive Predictive Value)¶
In [219]:
TP,FP
Out[219]:
(33, 11)
In [220]:
Precision=TP/(TP+FP)
Precision
Out[220]:
0.75
In [221]:
33/(33+11)
Out[221]:
0.75
In [222]:
# print precision score
precision_Score=TP/float(TP+FP)*100
print('Precision score:{0:0.4f}'.format(precision_Score))
Precision score:75.0000
In [223]:
from sklearn.metrics import precision_score
# Calculating precision score
precision = precision_score(y_test, lr_pred) * 100
print("Precision Score is:", precision)
# Calculating micro average precision score
micro_precision = precision_score(y_test, lr_pred, average='micro') * 100
print("Micro Average Precision Score is:", micro_precision)
# Calculating macro-average precision score
macro_precision = precision_score(y_test, lr_pred, average='macro') * 100
print("Macro Average Precision Score is:", macro_precision)
# Calculating weighted precision score
weighted_precision = precision_score(y_test, lr_pred, average='weighted') * 100
print("Weighted Average Precision Score is:", weighted_precision)
# Calculate precision score on non weighted score
print("precision score on non weighted score is:",precision_score(y_test,lr_pred,average=None)*100)
Precision Score is: 75.0 Micro Average Precision Score is: 77.27272727272727 Macro Average Precision Score is: 76.5909090909091 Weighted Average Precision Score is: 77.00413223140497 precision score on non weighted score is: [78.18181818 75. ]
In [224]:
print('Classification Report of Logistic Regression:\n',classification_report(y_test,lr_pred,digits=4))
Classification Report of Logistic Regression:
precision recall f1-score support
0 0.7818 0.8866 0.8309 97
1 0.7500 0.5789 0.6535 57
accuracy 0.7727 154
macro avg 0.7659 0.7328 0.7422 154
weighted avg 0.7700 0.7727 0.7652 154
Recall(True Positive rate(TRP))¶
In [225]:
recall_score=TP/float(TP+FN)*100
print('recall_score',recall_score)
recall_score 57.89473684210527
In [226]:
TP,FN
Out[226]:
(33, 24)
In [227]:
33/(33+24)
Out[227]:
0.5789473684210527
In [228]:
from sklearn.metrics import recall_score
# Calculate recall score
recall = recall_score(y_test, lr_pred) * 100
# Print recall score
print('Recall or Sensitivity Score:', recall)
Recall or Sensitivity Score: 57.89473684210527
In [229]:
micro_precision = recall_score(y_test, lr_pred, average='micro') * 100
print("Micro Average recall Score is:", micro_precision)
# Calculating macro-average precision score
macro_precision = recall_score(y_test, lr_pred, average='macro') * 100
print("Macro Average recall Score is:", macro_precision)
# Calculating weighted precision score
weighted_precision = recall_score(y_test, lr_pred, average='weighted') * 100
print("Weighted Average recall Score is:", weighted_precision)
# Calculate precision score on non weighted score
print("recall score on non weighted score is:",recall_score(y_test,lr_pred,average=None)*100)
Micro Average recall Score is: 77.27272727272727 Macro Average recall Score is: 73.27726532826912 Weighted Average recall Score is: 77.27272727272727 recall score on non weighted score is: [88.65979381 57.89473684]
In [230]:
print('Classification Report of Logistic Regression:\n',classification_report(y_test,lr_pred,digits=4))
Classification Report of Logistic Regression:
precision recall f1-score support
0 0.7818 0.8866 0.8309 97
1 0.7500 0.5789 0.6535 57
accuracy 0.7727 154
macro avg 0.7659 0.7328 0.7422 154
weighted avg 0.7700 0.7727 0.7652 154
In [231]:
FPR = FP / (FP + TN)*100
# Print False Positive Rate
print("False Positive Rate (FPR) is:{0:0.4f}".format(FPR))
False Positive Rate (FPR) is:11.3402
In [232]:
FP,TN
Out[232]:
(11, 86)
In [233]:
11/(11+86)
Out[233]:
0.1134020618556701
Specificity¶
In [234]:
specificity = tn / (tn + fp)*100
print("Specificity (True Negative Rate) is:{0:0.4f}".format(specificity))
Specificity (True Negative Rate) is:88.6598
F1Score¶
In [235]:
from sklearn.metrics import f1_score
# Calculate F1 score
f1 = f1_score(y_test, lr_pred)*100
# Print F1 score
print("F1 Score is:", f1)
F1 Score is: 65.34653465346535
In [236]:
micro_precision = f1_score(y_test, lr_pred, average='micro') * 100
print("Micro Average f1 Score is:", micro_precision)
# Calculating macro-average precision score
macro_precision = recall_score(y_test, lr_pred, average='macro') * 100
print("Macro Average f1 Score is:", macro_precision)
# Calculating weighted precision score
weighted_precision = f1_score(y_test, lr_pred, average='weighted') * 100
print("Weighted Average f1 Score is:", weighted_precision)
# Calculate precision score on non weighted score
print("f1 score on non weighted score is:",f1_score(y_test,lr_pred,average=None)*100)
Micro Average f1 Score is: 77.27272727272727 Macro Average f1 Score is: 73.27726532826912 Weighted Average f1 Score is: 76.52373933045479 f1 score on non weighted score is: [83.09178744 65.34653465]
Classification report on logistic regression¶
In [237]:
print('Classification Report of Logistic Regression:\n',classification_report(y_test,lr_pred,digits=4))
Classification Report of Logistic Regression:
precision recall f1-score support
0 0.7818 0.8866 0.8309 97
1 0.7500 0.5789 0.6535 57
accuracy 0.7727 154
macro avg 0.7659 0.7328 0.7422 154
weighted avg 0.7700 0.7727 0.7652 154
ROC(Reciever operating characteristic) curve and ROC AUC(Area Under Curve)¶
In [238]:
from sklearn.metrics import roc_auc_score
# Calculate AUC score
auc = roc_auc_score(y_test, lr_pred)
# Print AUC score
print("Area Under the Curve (AUC) is:", auc)
Area Under the Curve (AUC) is: 0.7327726532826913
In [301]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Assuming you have already computed fpr, tpr, and roc_auc
# Plot ROC curve
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve logistic regression')
plt.legend(loc="lower right")
plt.show()
confusion matrix of KNN¶
In [240]:
sns.heatmap(confusion_matrix(y_test,knn_pred),annot=True,fmt="d")
Out[240]:
<Axes: >
In [255]:
# making the confusion matrix of the KNN
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
cm=confusion_matrix(y_test,knn_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of KNN:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
TN-True Negative 82 FP-False Positive 15 FN-false Negative 24 TP-True Positive 33 Accuracy rate 74.67532467532467 misclassification Rate of KNN:25.324675324675322
In [256]:
74.67532467532467+25.324675324675322
Out[256]:
100.0
In [257]:
#classification report of KNN
print('Classification Report of KNN:\n',classification_report(y_test,knn_pred,digits=4))
Classification Report of KNN:
precision recall f1-score support
0 0.7736 0.8454 0.8079 97
1 0.6875 0.5789 0.6286 57
accuracy 0.7468 154
macro avg 0.7305 0.7122 0.7182 154
weighted avg 0.7417 0.7468 0.7415 154
Area Under Curve Of KNN¶
In [258]:
# Area Under Curve
auc = roc_auc_score(y_test, knn_pred)
# Print AUC score
print("Area Under the Curve (AUC) is:", auc)
Area Under the Curve (AUC) is: 0.7121540965816603
In [259]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Assuming you have already computed fpr, tpr, and thresholds
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve of KNN')
plt.legend()
plt.grid()
plt.show()
Confusion matrix of "Naive Bayes"¶
In [260]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming you have already trained your Naive Bayes classifier (nb) and made predictions (nb_pred)
# y_test contains the true labels
# Generate confusion matrix
cm = confusion_matrix(y_test, nb_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of Naive Bayes:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
TN-True Negative 78 FP-False Positive 19 FN-false Negative 21 TP-True Positive 36 Accuracy rate 74.02597402597402 misclassification Rate of Naive Bayes:25.97402597402597
In [261]:
74.02597402597402 +25.97402597402597
Out[261]:
100.0
In [262]:
sns.heatmap(confusion_matrix(y_test,nb_pred),annot=True,fmt="d")
Out[262]:
<Axes: >
Classification Report of Naive Bayes¶
In [263]:
print('Classification Report of KNN:\n',classification_report(y_test,nb_pred,digits=4))
Classification Report of KNN:
precision recall f1-score support
0 0.7879 0.8041 0.7959 97
1 0.6545 0.6316 0.6429 57
accuracy 0.7403 154
macro avg 0.7212 0.7179 0.7194 154
weighted avg 0.7385 0.7403 0.7393 154
Roc AUC Score OF Naive Bayes¶
In [264]:
auc = roc_auc_score(y_test, nb_pred)
# Print AUC score
print("Area Under the Curve (AUC) is:", auc)
Area Under the Curve (AUC) is: 0.7178513293543136
In [300]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Assuming you have already computed fpr, tpr, and thresholds
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve of Naive bayes')
plt.legend()
plt.grid()
plt.show()
confusion matrix of SVM¶
In [270]:
sns.heatmap(confusion_matrix(y_test,sv_pred),annot=True,fmt="d")
Out[270]:
<Axes: >
In [290]:
#making the confusion matrix of svm
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming you have already trained your Naive Bayes classifier (nb) and made predictions (nb_pred)
# y_test contains the true labels
# Generate confusion matrix
cm = confusion_matrix(y_test, sv_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of SVM:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
TN-True Negative 91 FP-False Positive 6 FN-false Negative 20 TP-True Positive 37 Accuracy rate 83.11688311688312 misclassification Rate of SVM:16.883116883116884
In [277]:
# classification report of svm
print('Classification Report of svm:\n',classification_report(y_test,sv_pred,digits=4))
Classification Report of svm:
precision recall f1-score support
0 0.8198 0.9381 0.8750 97
1 0.8605 0.6491 0.7400 57
accuracy 0.8312 154
macro avg 0.8401 0.7936 0.8075 154
weighted avg 0.8349 0.8312 0.8250 154
Roc AUC of Svm¶
In [279]:
from sklearn.metrics import roc_auc_score
auc = round(roc_auc_score(y_test, sv_pred)*100,2)
print("roc_auc_score of svc:", auc)
roc_auc_score of svc: 79.36
In [282]:
fpr, tpr, thresholds = roc_curve(y_test, sv_pred)
plt.plot(fpr, tpr, color='darkorange', label='ROC')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--',label='ROC curve(area=%0.2f)'%auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM Classifier')
plt.legend()
plt.grid()
plt.show()
confusion matrix of Decision tree¶
In [283]:
sns.heatmap(confusion_matrix(y_test,dt_pred),annot=True,fmt="d")
Out[283]:
<Axes: >
In [285]:
#making the confusion matrix of Decision tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Generate confusion matrix
cm = confusion_matrix(y_test, dt_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of decision tree:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
TN-True Negative 82 FP-False Positive 15 FN-false Negative 15 TP-True Positive 42 Accuracy rate 80.51948051948052 misclassification Rate of decision tree:19.480519480519483
In [286]:
# classification report of Decision tree
print('Classification Report of Decision tree:\n',classification_report(y_test,dt_pred,digits=4))
Classification Report of Decision tree:
precision recall f1-score support
0 0.8454 0.8454 0.8454 97
1 0.7368 0.7368 0.7368 57
accuracy 0.8052 154
macro avg 0.7911 0.7911 0.7911 154
weighted avg 0.8052 0.8052 0.8052 154
Roc AUC of Decision tree¶
In [296]:
from sklearn.metrics import roc_auc_score
auc = round(roc_auc_score(y_test, dt_pred)*100,2)
print("roc_auc_score of decision tree:", auc)
roc_auc_score of decision tree: 79.11
In [297]:
fpr, tpr, thresholds = roc_curve(y_test, dt_pred)
plt.plot(fpr, tpr, color='darkorange', label='ROC')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--',label='ROC curve(area=%0.2f)'%auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Decision tree')
plt.legend()
plt.grid()
plt.show()
confusion matrix of random forest¶
In [291]:
sns.heatmap(confusion_matrix(y_test,rf_pred),annot=True,fmt="d")
Out[291]:
<Axes: >
In [292]:
#making the confusion matrix of Decision tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Generate confusion matrix
cm = confusion_matrix(y_test, rf_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of decision tree:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
TN-True Negative 84 FP-False Positive 13 FN-false Negative 19 TP-True Positive 38 Accuracy rate 79.22077922077922 misclassification Rate of decision tree:20.77922077922078
In [293]:
# classification report of random forest
print('Classification Report of Decision tree:\n',classification_report(y_test,rf_pred,digits=4))
Classification Report of Decision tree:
precision recall f1-score support
0 0.8155 0.8660 0.8400 97
1 0.7451 0.6667 0.7037 57
accuracy 0.7922 154
macro avg 0.7803 0.7663 0.7719 154
weighted avg 0.7895 0.7922 0.7896 154
roc auc of random forest¶
In [294]:
from sklearn.metrics import roc_auc_score
auc = round(roc_auc_score(y_test, rf_pred)*100,2)
print("roc_auc_score of random forest:", auc)
roc_auc_score of decision tree: 76.63
In [299]:
fpr, tpr, thresholds = roc_curve(y_test, rf_pred)
plt.plot(fpr, tpr, color='darkorange', label='ROC')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--',label='ROC curve(area=%0.2f)'%auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for random forest')
plt.legend()
plt.grid()
plt.show()
END¶
In [ ]: